In [1]:
%pylab
%matplotlib inline


Using matplotlib backend: agg
Populating the interactive namespace from numpy and matplotlib

In [2]:
cd ..


/home/scott/Documents/git/neukrill-net-work

In [7]:
import sys

import numpy as np
import skimage
import cv2
import sklearn
# Explicit submodule imports: a bare `import sklearn` does not guarantee
# that sklearn.preprocessing / sklearn.decomposition / etc. are reachable
# as attributes, yet later cells rely on exactly that access pattern.
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.metrics

In [4]:
from IPython.display import display
from IPython.display import Image
from IPython.display import HTML

In [5]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [6]:
import neukrill_net.utils as utils
import neukrill_net.image_processing as image_processing
import neukrill_net.bagofwords as bagofwords

In [8]:
# Load the project settings (image filename manifest and class list)
# from settings.json in the working directory set by the `cd ..` above.
settings = utils.Settings('settings.json')

In [9]:
# Load every training image along with its class label, grouped by the
# class list from the settings manifest.
rawdata, labels = utils.load_rawdata(settings.image_fnames, classes=settings.classes)

In [10]:
# Encode the string class labels as integer targets for sklearn.
# label_encoder is kept around so predictions could be mapped back to
# class names later (e.g. for a submission file).
label_encoder = sklearn.preprocessing.LabelEncoder()
y = label_encoder.fit_transform(labels)

In [11]:
# Bag-of-words hyperparameters: 15x15 pixel patches, a kmeans vocabulary
# of 20 visual words, and a fixed seed for reproducible clustering.
bow_options = {
    'verbose': True,
    'n_features_max': 100,
    'patch_size': 15,
    'clusteralgo': 'kmeans',
    'n_clusters': 20,
    'random_seed': 42,
}

In [23]:
# Bag-of-words encoder; histograms are left unnormalised so each bin is
# a raw keypoint count (see the integer-valued rows in Out[28] below).
bow = bagofwords.Bow(normalise_hist=False, **bow_options)

In [13]:
# Draw 1000 training-image indices uniformly (with replacement).
# np.random.random_integers is deprecated (removed in NumPy 1.17);
# randint's upper bound is exclusive, so [0, len(rawdata)) reproduces
# the old inclusive [0, len(rawdata)-1] range exactly.
sample = np.random.randint(0, len(rawdata), size=1000)

In [24]:
# Build the visual-word vocabulary by clustering patch descriptors from
# the 1000-image sample (kmeans with 20 clusters, per bow_options).
bow.build_vocabulary([rawdata[i] for i in sample])


Describing the keypoints of 1000 images
Clustering patch descriptors to form vocabulary

Visualising vocabulary clustering of keypoints

Setup


In [37]:
# Collect the keypoint descriptors of every sampled image, drop images
# that yielded no keypoints (describeImage returns None for those), and
# stack the rest into one descriptor-per-row matrix.
descriptions = (bow.describeImage(rawdata[i]) for i in sample)
dscdata = np.vstack([d for d in descriptions if d is not None])

# Assign each descriptor to its nearest vocabulary cluster.
dscclass = bow.cluster.predict(dscdata)

PCA


In [55]:
# Project the descriptor cloud onto its first three principal components.
pca = sklearn.decomposition.PCA(n_components=3)
reduced_data = pca.fit_transform(dscdata)

In [56]:
# Plot each pair of the 3 components, coloured by vocabulary cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


ICA


In [53]:
# FastICA projection to 3 components. FastICA is stochastic, so fix
# random_state to make the visualisation reproducible across re-runs,
# matching the random_seed=42 convention used for the BoW vocabulary.
reduced_data = sklearn.decomposition.FastICA(n_components=3, random_state=42).fit_transform(dscdata)


/home/scott/Documents/git/neukrill-venv3/lib/python3.4/site-packages/sklearn/decomposition/fastica_.py:282: DeprecationWarning: Implicitly casting between incompatible kinds. In a future numpy release, this will raise an error. Use casting="unsafe" if this is intentional.
  X -= X_mean[:, np.newaxis]

In [54]:
# Plot each pair of the 3 independent components, coloured by cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


Factor Analysis


In [57]:
# Factor-analysis projection of the descriptors onto 3 latent factors.
fa = sklearn.decomposition.FactorAnalysis(n_components=3)
reduced_data = fa.fit_transform(dscdata)

In [58]:
# Plot each pair of the 3 latent factors, coloured by cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


Training with Logistic Regression


In [27]:
# Encode every training image as a bag-of-words histogram and stack
# them into the design matrix X (one row per image, one column per
# visual word).
# NOTE(review): unlike the describeImage loop above, no None-filtering
# happens here — assumes compute_image_bow always returns a histogram;
# confirm against the Bow implementation.
X = np.vstack([bow.compute_image_bow(img) for img in rawdata])

In [16]:
# Stratified shuffle-split cross-validation (the old pre-0.18 API takes
# the label vector directly; default 10 iterations — matching the 10
# log-loss values printed below).
# NOTE(review): sklearn.cross_validation is deprecated in favour of
# sklearn.model_selection — migrate when upgrading scikit-learn.
cv = sklearn.cross_validation.StratifiedShuffleSplit(y)

In [17]:
# Logistic regression classifier with default hyperparameters.
clf = sklearn.linear_model.LogisticRegression()

In [28]:
# Sanity check: the first 10 BoW histograms (20 bins, matching n_clusters).
X[:10]


Out[28]:
array([[  1.,   0.,   3.,   1.,   7.,   0.,   0.,   0.,   0.,   0.,   2.,
          0.,   2.,   0.,   1.,   4.,   1.,   0.,   2.,   0.],
       [  2.,   0.,   2.,   3.,  11.,   3.,   1.,   3.,   2.,   0.,   6.,
          0.,   5.,   1.,   2.,   6.,   1.,   1.,   4.,   6.],
       [  1.,   0.,   1.,   1.,   9.,   0.,   1.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,  11.,   0.,   0.,   3.,   0.],
       [  1.,   0.,   2.,   0.,   1.,   0.,   0.,   0.,   2.,   0.,   2.,
          0.,   0.,   2.,   5.,  10.,   6.,   0.,   3.,   2.],
       [  2.,   0.,   0.,   3.,   5.,   4.,   0.,   3.,   0.,   0.,   7.,
          0.,   1.,   1.,   1.,  15.,   1.,   0.,   1.,   0.],
       [  1.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   1.,   0.,   0.,   5.,   2.,   0.,   0.,   0.],
       [  9.,   0.,   0.,   0.,   7.,   0.,   0.,   9.,   0.,   0.,   1.,
          0.,   0.,   2.,   1.,   5.,   0.,   0.,   3.,   1.],
       [  8.,   0.,   0.,   1.,   7.,   0.,   0.,   3.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   3.,   1.,   0.,   6.,   1.],
       [  0.,   0.,   2.,   1.,   6.,   1.,   0.,   0.,   0.,   0.,   0.,
          0.,   2.,   0.,   0.,  10.,   1.,   0.,   7.,   1.],
       [  4.,   0.,   3.,   3.,   5.,   3.,   2.,   5.,   0.,   0.,   4.,
          0.,   0.,   0.,   0.,   5.,   0.,   0.,   1.,   0.]])

In [26]:
# Histogram of a single image; matches the first row of X shown above.
bow.compute_image_bow(rawdata[0])


Out[26]:
array([1, 0, 3, 1, 7, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 4, 1, 0, 2, 0])

In [29]:
# Cross-validate the logistic regression on the BoW features,
# collecting the multiclass log-loss of each fold.
# NOTE(review): the vocabulary (and hence X) was built from images that
# also appear in the held-out folds; the commented-out lines below would
# rebuild the BoW per training fold to avoid that leakage — confirm
# whether the reported log-loss is optimistic because of this.
print('Cross-validating')
results = []
for train, test in cv:
    # Make a new BOW encoding
    #bow = bagofwords.Bow(**bow_options)
    #bow.build_vocabulary([rawdata[i] for i in train])
    #X = [bow.compute_image_bow(img) for img in rawdata]

    clf.fit(X[train], y[train])
    p = clf.predict_proba(X[test])
    res = sklearn.metrics.log_loss(y[test], p)
    print(res)
    results.append(res)


Cross-validating
2.4409929416
2.46779735961
2.41937293395
2.47116204372
2.45700342345
2.44456435296
2.46251010289
2.4428022459
2.43156976094
2.45793841621

Try to predict classes of test data


In [30]:
# Refit on the full training set before scoring the test images.
print('Fitting clf to all training data')
clf.fit(X,y)


Fitting clf to all training data
Loading the raw test data
Bagging words for raw test data

In [ ]:
# NOTE(review): assumes load_rawdata without a `classes` argument yields
# the test images plus their filenames — confirm against utils;
# `names` would be needed to label rows of any submission built from p.
print('Loading the raw test data')
rawtest, names = utils.load_rawdata(settings.image_fnames)

In [34]:
# Encode the test images with the training vocabulary and score them.
# NOTE(review): Out[33] shows X2 has only 30336 rows while Out[32]
# reports 130400 test files — rawtest appears incomplete or filtered;
# verify before using p for a submission.
print('Bagging words for raw test data')
X2 = [bow.compute_image_bow(img) for img in rawtest]
X2 = np.vstack(X2)
p = clf.predict_proba(X2)


Bagging words for raw test data

In [32]:
# Expected number of test images according to the settings manifest.
len(settings.image_fnames['test'])


Out[32]:
130400

In [33]:
# Row count here should match the manifest count above (it does not —
# see the NOTE on the cell that built X2).
X2.shape


Out[33]:
(30336, 20)

Visualise Logistic Regression classes